import json
import os
import re

import requests
from lxml import etree


class FoxRec(object):
    """Scraper for the Foxconn recruitment site (ASP.NET WebForms).

    Fetches the paginated job-listing page, walks every listing page via
    WebForms postbacks, parses each job's detail page, and dumps all
    collected records to a JSON file.
    """

    def __init__(self, base_url):
        """Prepare session, URLs and the result accumulator.

        base_url: site root, e.g. 'http://hr.foxconn.com/R_Society'.
        """
        self.base_url = base_url
        # URL of the job-listing (first) page.
        self.start_url = '{}/{}'.format(base_url, 'Job_listInfo.aspx')
        # Original code assigned self.headers twice; once is enough.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
        # One session so ASP.NET cookies survive across requests.
        self.request_s = requests.session()
        self.all_info_lists = list()  # accumulated job dicts

    @staticmethod
    def _first_text(tree, xpath):
        """Return the first text node matched by *xpath*, or '' if absent.

        Prevents a single malformed detail page from aborting the whole
        crawl with an IndexError.
        """
        nodes = tree.xpath(xpath)
        return nodes[0] if nodes else ''

    def get_detail_info(self, detail_response):
        """Parse one job-detail page (HTML string) into a dict.

        Returns a dict with job name/type, publish date, organization,
        requirements and headcount, keyed as in the original output.
        """
        item = dict()
        detail_tree = etree.HTML(detail_response)

        # Output key -> id of the ASP.NET label span holding that field.
        # NOTE: the 'pubuli_date' typo is kept deliberately so existing
        # consumers of the JSON output keep working.
        fields = (
            ('job_name', 'ctl00_Content_lbljob_name'),
            ('job_type', 'ctl00_Content_lbljob_type_name'),
            ('pubuli_date', 'ctl00_Content_lblupdate_date'),
            ('organ_name', 'ctl00_Content_lbldept_name'),
            ('request_education', 'ctl00_Content_education_name'),
            ('request_major', 'ctl00_Content_lbljob_major'),
            ('request_job_years', 'ctl00_Content_lbljob_years'),
            ('request_count', 'ctl00_Content_lbljob_count'),
            ('request_detail', 'ctl00_Content_lbljob_promulgator'),
        )
        for key, span_id in fields:
            item[key] = self._first_text(
                detail_tree, '//span[@id="{}"]/text()'.format(span_id))

        print('get_detail_page_info_ok!')
        return item

    def handel_listpage(self, listpage_response):
        """Scrape every job row of one listing page.

        listpage_response: an lxml element tree of a listing page.
        Appends one dict per job to self.all_info_lists.
        """
        # Rows of the results table; [1:] skips the header row.
        tr_lists = listpage_response.xpath('//th[text()="职位名称"]/../../tr')[1:]

        for tr in tr_lists:
            links = tr.xpath('.//a/@href')
            if not links:
                # Defensive: skip rows without a detail link instead of
                # crashing the whole crawl.
                continue
            detail_url = '{}/{}'.format(self.base_url, links[0])
            # Fetch the detail page and hand it to the parser.
            detail_response = self.request_s.get(
                detail_url, headers=self.headers, timeout=60).content.decode()
            item = self.get_detail_info(detail_response=detail_response)
            self.all_info_lists.append(item)

        print('get_page_ok!')

    def handel_info(self, response):
        """Scrape page 1, then post back through pages 2..max_page.

        response: HTML string of the first listing page.
        """
        frist_tree = etree.HTML(response)
        # The "末页" ("last page") link's title attribute contains the
        # maximum page number; raw string avoids the invalid-escape
        # warning of the original '\d+'.
        max_page = int(re.findall(r'\d+', frist_tree.xpath('//a[text()="末页"]/@title')[0])[0])
        print(max_page)  # maximum page number

        # Scrape the detail pages reachable from page 1.
        self.handel_listpage(listpage_response=frist_tree)

        # Collect the hidden form fields (__VIEWSTATE etc.) required by
        # the WebForms pagination postback.
        input_dict = dict()
        input_name_lists = frist_tree.xpath('//form[@name="aspnetForm"]//input[@name]')
        for input_name in input_name_lists:
            name = input_name.xpath('./@name')[0]
            values = input_name.xpath('./@value')
            input_dict[name] = values[0] if values else ''
        print(input_dict)

        # Pages 2 through max_page inclusive — the original
        # range(2, max_page) silently skipped the last page.
        for i in range(2, max_page + 1):
            input_dict['ctl00$Content$pager_input'] = i
            # timeout added for consistency with the GET requests.
            other_page_reponse = self.request_s.post(
                self.start_url, data=input_dict, headers=self.headers,
                timeout=60).content.decode()
            other_page_tree = etree.HTML(other_page_reponse)
            self.handel_listpage(listpage_response=other_page_tree)

    def main(self):
        """Entry point: crawl all pages and write the results as JSON."""
        response = self.request_s.get(self.start_url, headers=self.headers, timeout=60).content.decode()

        self.handel_info(response=response)

        # Build the output path portably (the original hard-coded a
        # Windows '\\' separator) and make sure the directory exists.
        out_dir = 'file_json'
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, 'fox_rec_info.json'), 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.all_info_lists, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    # Crawl the Foxconn society-recruitment site end to end.
    scraper = FoxRec('http://hr.foxconn.com/R_Society')
    scraper.main()
